# Import Libraries to Carry out the Analysis
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
plt.style.use('ggplot')


# Load the Netflix dataset from the CSV file into a pandas DataFrame
df_netflix = pd.read_csv('netflix.csv', sep=",")


# Preview the first five rows of the dataframe
df_netflix.head()


# Print column names, non-null counts and dtypes
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2370 entries, 0 to 2369
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    2370 non-null   int64 
 1   type          2370 non-null   object
 2   title         2370 non-null   object
 3   director      1737 non-null   object
 4   cast          2085 non-null   object
 5   country       2370 non-null   object
 6   release_year  2370 non-null   int64 
 7   duration      2367 non-null   object
 8   listed_in     2370 non-null   object
 9   description   2370 non-null   object
 10  year_added    2370 non-null   int64 
 11  month_added   2370 non-null   int64 
 12  main_genre    2370 non-null   object
dtypes: int64(4), object(9)
memory usage: 240.8+ KB


# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_netflix.describe()


# Dimensions of the dataframe as a (rows, columns) tuple
df_netflix.shape

(2370, 13)


# Percentage of missing values per column, rounded to two decimals and
# rendered as strings with a trailing '%'
null_counts = df_netflix.isna().sum()
percent_missing = (null_counts * 100 / len(df_netflix)).round(2)
percent_missing = percent_missing.astype(str) + '%'
percent_missing

Unnamed: 0        0.0%
type              0.0%
title             0.0%
director        26.71%
cast            12.03%
country           0.0%
release_year      0.0%
duration         0.13%
listed_in         0.0%
description       0.0%
year_added        0.0%
month_added       0.0%
main_genre        0.0%
dtype: object


# Share of fully duplicated rows, expressed as a percentage string
duplicate_rows = df_netflix.duplicated().sum()
dup_percentage = round(duplicate_rows / len(df_netflix) * 100, 2)
# Format with an f-string instead of calling `.astype(str)` on a scalar:
# `.astype` only exists on NumPy scalars, and this yields a plain `str`.
dup_percentage = f"{dup_percentage}%"
dup_percentage

'0.0%'


# Give the unnamed leading column a meaningful name: movie_id
column_renames = {'Unnamed: 0': 'movie_id'}
df_netflix.rename(columns=column_renames, inplace=True)


# Three columns contain missing values: director (26.71%), cast (12.03%)
# and duration (0.13%).
# Dropping every incomplete row would throw away more than a quarter of the
# data and, because the TV shows in this dataset mostly have no director
# recorded, would remove almost all series and distort the later
# movies-vs-series comparisons.
# Missing director/cast entries are therefore replaced with an explicit
# 'Unknown' placeholder; only rows that are still incomplete afterwards
# (e.g. missing duration) are dropped, so the result has no missing entries.
df_netflix.fillna({'director': 'Unknown', 'cast': 'Unknown'}, inplace=True)
df_netflix.dropna(inplace=True)


# Re-compute the percentage of missing values per column after cleaning
total_rows = len(df_netflix)
missing_share = df_netflix.isna().sum().mul(100).div(total_rows)
percent_missing = missing_share.round(2).astype(str) + '%'
percent_missing

movie_id        0.0%
type            0.0%
title           0.0%
director        0.0%
cast            0.0%
country         0.0%
release_year    0.0%
duration        0.0%
listed_in       0.0%
description     0.0%
year_added      0.0%
month_added     0.0%
main_genre      0.0%
dtype: object


# Write the cleaned dataframe to CSV for Streamlit
# NOTE(review): with pandas' default settings the dataframe index is written
# as an extra unnamed first column - confirm the consumer expects that.
cleaned_csv_path = 'netflix_cleaned.csv'
df_netflix.to_csv(cleaned_csv_path)


# Count the rows per year_added, then keep only the years from 2012 onwards
yearly_counts = df_netflix.groupby(['year_added'], as_index=False).count()
since_2012 = yearly_counts['year_added'] >= 2012
df_content = yearly_counts.loc[since_2012]
df_content.head()


# Line chart of the amount of content added per year.
# The `title` column of df_content holds the per-year row count.
# (The former `fig = go.Figure()` was dead code: it was overwritten at once.)
fig = px.line(df_content, x="year_added", y="title")
# Colour the line via update_traces rather than indexing fig.data[0]
fig.update_traces(line_color="red")
fig.update_layout(
    title="Amount of Content Added in Netflix Since 2012",
    xaxis_title="Year",
    yaxis_title="Amount of Content",
)
# Treat the years as categories rather than a continuous numeric axis
fig.update_xaxes(type='category')
fig.show()


# Split the dataframe by content type: movies vs. TV shows
is_movie = df_netflix['type'] == 'Movie'
is_tv_show = df_netflix['type'] == 'TV Show'
df_films = df_netflix.loc[is_movie]
df_series = df_netflix.loc[is_tv_show]


# Count the series added per year, restricted to 2012 and later
series_per_year = df_series.groupby(['year_added'], as_index=False).count()
df_content_series = series_per_year[series_per_year['year_added'].ge(2012)]


# Bar chart of the number of series added per year.
# The `title` column of df_content_series holds the per-year row count.
# (The former `fig = go.Figure()` was dead code: it was overwritten at once.)
fig = px.bar(df_content_series, x="year_added", y="title")
fig.update_layout(
    title="Amount of Series Added in Netflix Since 2012",
    xaxis_title="Year",
    yaxis_title="Amount of Series",
)
# Treat the years as categories rather than a continuous numeric axis
fig.update_xaxes(type='category')
fig.show()


# Count the movies added per year, restricted to 2012 and later
movies_per_year = df_films.groupby(['year_added'], as_index=False).count()
df_content_movies = movies_per_year[movies_per_year['year_added'].ge(2012)]


# Bar chart of the number of movies added per year.
# The `title` column of df_content_movies holds the per-year row count.
# (The former `fig = go.Figure()` was dead code: it was overwritten at once.)
fig = px.bar(df_content_movies, x="year_added", y="title")
fig.update_layout(
    title="Amount of Movies Added in Netflix Since 2012",
    xaxis_title="Year",
    yaxis_title="Amount of Movies",
)
# Treat the years as categories rather than a continuous numeric axis
fig.update_xaxes(type='category')
fig.show()


# Count rows per (month_added, type) pair, then reshape so that each month is
# a row and each content type a column; combinations without any rows get 0
df_season = df_netflix.groupby(['month_added', 'type'], as_index=False).count()
grouped_pivot = (
    df_season
    .pivot(index='month_added', columns='type', values='title')
    .fillna(0)
)
grouped_pivot.head()


# Horizontal bar chart comparing the content added per month for each type
ax = grouped_pivot.plot(kind='barh', figsize=(15, 10))
ax.set_title('Number of Movies and Series Added Each Month', fontsize=20)
ax.set_xlabel('Number of Content', fontsize=15)
ax.set_ylabel('Month', fontsize=15)
ax.legend(fontsize=10)
plt.show()


# Keep only the rows whose main genre is 'Horror Movies' and count them per month
horror_mask = df_netflix['main_genre'].eq('Horror Movies')
df_horror = df_netflix[horror_mask]
df_horror1 = df_horror.groupby(['month_added'], as_index=False).count()
df_horror1.head()


# Line chart of the number of horror movies added in each month.
# The `title` column of df_horror1 holds the per-month row count.
# (The former `fig = go.Figure()` was dead code: it was overwritten at once.)
fig = px.line(df_horror1, x="month_added", y="title")
# Colour the line via update_traces rather than indexing fig.data[0]
fig.update_traces(line_color="orange")
fig.update_layout(
    title="Horror Movies Added Each Month",
    xaxis_title="Month",
    yaxis_title="Number of Horror Movies",
)
# Treat the months as categories rather than a continuous numeric axis
fig.update_xaxes(type='category')
fig.show()


# Count rows per main genre and keep the ten genres with the most entries
genre_counts = df_netflix.groupby(['main_genre'], as_index=False).count()
df_genre = genre_counts.sort_values(by='title', ascending=False).head(10)
df_genre.head()


# Donut chart (pie with a hole) of the ten most frequent genres
# NOTE(review): only four colours are listed while up to ten slices are
# drawn - confirm the colouring of the remaining slices is as intended.
colors = ['blue', 'red', 'lightblue', 'orange']
genre_pie = go.Pie(
    labels=df_genre['main_genre'],
    values=df_genre['title'],
    hole=0.4,
    title='Top 10 Genres in Netflix',
    marker_colors=colors,
)
fig = go.Figure(data=genre_pie)

# Style the centre title, show percentages on the slices and
# label + percentage on hover
title_font = {'size': 25, 'family': 'Verdana', 'color': 'black'}
fig.update_traces(
    title_font=title_font,
    hoverinfo='label+percent',
    textinfo='percent',
    textfont_size=10,
)
fig.show()

	Unnamed: 0	type	title	director	cast	country	release_year	duration	listed_in	description	year_added	month_added	main_genre
0	428	Movie	Alaska Is a Drag	Shaz Bennett	Martin L. Washington Jr., Maya Washington, Mat...	United States	2017	83 min	Dramas, LGBTQ Movies	Tormented by bullies, an aspiring drag star wo...	2020	12	Dramas
1	429	TV Show	Chilling Adventures of Sabrina	NaN	Kiernan Shipka, Ross Lynch, Miranda Otto, Lucy...	United States	2020	4 Seasons	TV Horror, TV Mysteries, TV Sci-Fi & Fantasy	Magic and mischief collide as half-human, half...	2020	12	TV Horror
2	430	TV Show	Best Leftovers Ever!	NaN	Jackie Tohn, David So, Rosemary Shrager	United States	2020	1 Season	Reality TV	Decadent pasta from day-old fries? Skillful co...	2020	12	Reality TV
3	431	TV Show	Dare Me	NaN	Willa Fitzgerald, Herizen Guardiola, Marlo Kel...	United States	2019	1 Season	Crime TV Shows, TV Dramas, TV Thrillers	Relationships topple and loyalties flip when a...	2020	12	Crime TV Shows
4	432	Movie	Cops and Robbers	Arnon Manor, Timothy Ware-Hill	Timothy Ware-Hill	United States	2020	8 min	Dramas	Animation and activism unite in this multimedi...	2020	12	Dramas

	Unnamed: 0	release_year	year_added	month_added
count	2370.000000	2370.000000	2370.000000	2370.000000
mean	1616.802532	2013.588186	2018.366667	6.828270
std	689.640705	10.041914	1.553923	3.624008
min	428.000000	1942.000000	2008.000000	1.000000
25%	1020.250000	2013.000000	2017.000000	4.000000
50%	1612.500000	2017.000000	2019.000000	7.000000
75%	2212.750000	2019.000000	2020.000000	10.000000
max	2817.000000	2021.000000	2020.000000	12.000000

	year_added	movie_id	type	title	director	cast	country	release_year	duration	listed_in	description	month_added	main_genre
4	2012	2	2	2	2	2	2	2	2	2	2	2	2
5	2013	5	5	5	5	5	5	5	5	5	5	5	5
6	2014	10	10	10	10	10	10	10	10	10	10	10	10
7	2015	35	35	35	35	35	35	35	35	35	35	35	35
8	2016	95	95	95	95	95	95	95	95	95	95	95	95

type	Movie	TV Show
month_added
1	168.0	0.0
2	101.0	2.0
3	130.0	1.0
4	112.0	1.0
5	91.0	3.0

	main_genre	movie_id	type	title	director	cast	country	release_year	duration	listed_in	description	year_added	month_added
8	Dramas	316	316	316	316	316	316	316	316	316	316	316	316
3	Comedies	270	270	270	270	270	270	270	270	270	270	270	270
1	Children & Family Movies	211	211	211	211	211	211	211	211	211	211	211	211
17	Stand-Up Comedy	196	196	196	196	196	196	196	196	196	196	196	196
6	Documentaries	182	182	182	182	182	182	182	182	182	182	182	182

Netflix Movie Data Analysis ¶

Dataset Used: netflix.csv¶

Data Description:¶

Tasks¶

1. Get familiar with the data. What information does the dataset contain? Is the dataset complete?¶

First Impressions of the Dataset:¶

2. Clean the data set. The goal of this section is to have a dataset that does not contain any missing entries. Think about how to best deal with the missing values.¶

3. Carry out an analysis of the dataset. Answer the following questions as graphically as possible:¶

3.1 How has the amount of content (series and films) added each year developed since 2012?¶

Graph Description:¶

3.2 What about series or films individually? Is there a difference between the two formats?¶

Comparison:¶

3.3 Are the same number of films and series added every season (every month)?¶

Conclusion:¶

3.4 What about the horror movie genre? Is there a trend around Halloween (October)?¶

Conclusion:¶

3.5 Additional results and insights have a positive effect on the rating, depending on the information content.¶

Chart Description:¶

	month_added	movie_id	type	title	director	cast	country	release_year	duration	listed_in	description	year_added	main_genre
0	1	14	14	14	14	14	14	14	14	14	14	14	14
1	2	8	8	8	8	8	8	8	8	8	8	8	8
2	3	3	3	3	3	3	3	3	3	3	3	3	3
3	4	7	7	7	7	7	7	7	7	7	7	7	7
4	5	4	4	4	4	4	4	4	4	4	4	4	4

	month_added	movie_id	type	title	director	cast	country	release_year	duration	listed_in	description	year_added	main_genre
0	1	14	14	14	14	14	14	14	14	14	14	14	14
1	2	8	8	8	8	8	8	8	8	8	8	8	8
2	3	3	3	3	3	3	3	3	3	3	3	3	3
3	4	7	7	7	7	7	7	7	7	7	7	7	7
4	5	4	4	4	4	4	4	4	4	4	4	4	4

Netflix Movie Data Analysis ¶

Dataset Used: netflix.csv¶

Data Description:¶

Tasks¶

1. Get familiar with the data. What information does the dataset contain? Is the dataset complete?¶

First Impressions of the Dataset:¶

2. Clean the data set. The goal of this section is to have a dataset that does not contain any missing entries. Think about how to best deal with the missing values.¶

3. Carry out an analysis of the dataset. Answer the following questions as graphically as possible:¶

3.1 How has the amount of content (series and films) added each year developed since 2012?¶

Graph Description:¶

3.2 What about series or films individually? Is there a difference between the two formats?¶

Comparison:¶

3.3 Are the same number of films and series added every season (every month)?¶

Conclusion:¶

3.4 What about the horror movie genre? Is there a trend around Halloween (October)?¶

Conclusion:¶

3.5 Additional results and insights have a positive effect on the rating, depending on the information content.¶

Chart Description:¶

Netflix Movie Data Analysis ¶

	month_added	movie_id	type	title	director	cast	country	release_year	duration	listed_in	description	year_added	main_genre
0	1	14	14	14	14	14	14	14	14	14	14	14	14
1	2	8	8	8	8	8	8	8	8	8	8	8	8
2	3	3	3	3	3	3	3	3	3	3	3	3	3
3	4	7	7	7	7	7	7	7	7	7	7	7	7
4	5	4	4	4	4	4	4	4	4	4	4	4	4